import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from datasets import load_dataset, Dataset
import json
import datasets

def _as_text(value):
    """Return *value* as a string, mapping ``None`` to the empty string."""
    # A plain str(None) would inject the literal text "None" into the data.
    return "" if value is None else str(value)


def _load_synthetic_records(path):
    """Read instruction/input/output records from the JSONL file at *path*.

    Each non-blank line is parsed with ``json.loads``; the three fields are
    coerced to strings, with missing or null fields becoming ``""``.
    """
    records = []
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            # Make sure every field is a string.
            records.append({
                "instruction": _as_text(data.get("instruction")),
                "input": _as_text(data.get("input")),
                "output": _as_text(data.get("output")),
            })
    return records


def create_code_review_dataset(
    test_size=0.1,
    *,
    synthetic_path="./data/python_code_review_dataset.jsonl",
    seed=None,
):
    """Build a train/test split from CodeAlpaca plus local synthetic records.

    Loads the ``sahil2801/CodeAlpaca-20k`` dataset, reads the synthetic
    records from *synthetic_path*, concatenates the synthetic records with
    the CodeAlpaca ``"train"`` split, splits the result and prints the size
    of each split.

    Args:
        test_size: Passed through to ``train_test_split`` as ``test_size``.
        synthetic_path: Path of the JSONL file holding the synthetic
            records. A relative path is resolved against the current
            working directory.
        seed: Passed through to ``train_test_split`` as ``seed``. The
            default ``None`` keeps the previous unseeded behaviour.

    Returns:
        The object returned by ``train_test_split``; it is indexed here by
        the ``"train"`` and ``"test"`` keys.
    """
    # Load the base dataset.
    alpaca_dataset = load_dataset("sahil2801/CodeAlpaca-20k")

    # Load the synthetic data.
    synthetic_data = _load_synthetic_records(synthetic_path)

    # map() re-emits the three columns shared with the synthetic records;
    # it does not remove any other column the base dataset may carry.
    alpaca_data = alpaca_dataset.map(lambda x: {
        "instruction": x["instruction"],
        "input": x["input"],
        "output": x["output"],
    })

    # Merge the two datasets, then split.
    synthetic_dataset = Dataset.from_list(synthetic_data)
    full_dataset = datasets.concatenate_datasets(
        [synthetic_dataset, alpaca_data["train"]]
    )
    full_dataset = full_dataset.train_test_split(test_size=test_size, seed=seed)

    # Print dataset information.
    print(f"训练集大小: {len(full_dataset['train'])} 样本")
    print(f"测试集大小: {len(full_dataset['test'])} 样本")
    return full_dataset

if __name__ == "__main__":
    # Script entry point: build the dataset with the default arguments.
    # The function prints the train/test sizes; its return value is not
    # used here.
    create_code_review_dataset()